{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "import sys"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "plt.rc('figure', figsize=(10, 6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.chdir('../00_data/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "atec_anti_fraud_test_b.csv       df_50w.csv        submit_0212C.csv\n",
      "atec_anti_fraud_test_b_demo.csv  df_test_50w.csv   submit_0212D.csv\n",
      "atec_anti_fraud_train.csv        df_test.zip       train_head_10w.csv\n",
      "atec_anti_fraud_train_demo.csv   submit_0211A.csv  train_modified.csv\n",
      "df_10w.csv                       submit_0212A.csv\n",
      "df_1m.csv                        submit_0212B.csv\n"
     ]
    }
   ],
   "source": [
    "ls"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load data and split train/val"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 设置抑制warning 输出\n",
    "from sklearn.exceptions import DataConversionWarning\n",
    "import warnings\n",
    "warnings.filterwarnings(action='ignore', category=DataConversionWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df = pd.read_csv('./df_10w.csv')\n",
    "# X = df.iloc[:,3:]  # start from f1\n",
    "# y = df.iloc[:,1]     # label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 划分训练和测试集\n",
    "# test_ratio = 0.2   # 根据样本总量调整\n",
    "# X_train, X_test, y_train, y_test = \\\n",
    "#     train_test_split(X, y, test_size=test_ratio, \n",
    "#                      stratify=y, random_state=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# PCA\n",
    "\n",
    "- 利用PCA对数据降维，get_pca_order输出需要保留的主成分阶数\n",
    "- 试用效果不好，最终未采用"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from sklearn.preprocessing import StandardScaler\n",
    "# from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def get_pca_order(X):\n",
    "#     # 计算累计方差0.90 0.95 对应的pca阶数\n",
    "#     sc = StandardScaler()\n",
    "#     pca = PCA()\n",
    "#     X_std = sc.fit_transform(X)\n",
    "#     X_pca = pca.fit_transform(X_std)\n",
    "#     pca_sum = np.cumsum(pca.explained_variance_ratio_)\n",
    "\n",
    "#     print('前{}阶主成分,累计解释方差={}'.format(sum(pca_sum < 0.9),0.9))\n",
    "#     print('前{}阶主成分,累计解释方差={}'.format(sum(pca_sum < 0.95),0.95))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get_pca_order(X_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GBDT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn import metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 基本实现\n",
    "# pipe_gb = make_pipeline(StandardScaler(),\n",
    "#                         GradientBoostingClassifier(random_state=10))\n",
    "# pipe_gb.fit(X_train, y_train)\n",
    "\n",
    "# y_train_pred = pipe_gb.predict(X_train)\n",
    "# y_test_pred = pipe_gb.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model evaluation util"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_curve, auc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "def show_metric(X_data, y_gt, pipe):\n",
    "    # 输出metric 评估结果\n",
    "    y_pred = pipe_gb.predict(X_data)\n",
    "    acc = pipe.score(X_data, y_gt)\n",
    "    recall = metrics.recall_score(y_gt.values, y_pred,  average=None)\n",
    "    cnf_mat = metrics.confusion_matrix(y_gt.values, y_pred)\n",
    "    f1_score = metrics.f1_score(y_gt.values, y_pred)\n",
    "    \n",
    "    print('metrcis：')\n",
    "    print('Accuracy: %.3f' % acc)\n",
    "    print('Recall of classes: ', recall)\n",
    "    print('f1 score \\t: ', f1_score)\n",
    "    print('confusion matrix:')\n",
    "    print(cnf_mat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tpr(X_data, y_gt, pipe):\n",
    "    # 切阈值方法\n",
    "    # TPR1：当FPR等于0.001时的TPR\n",
    "    # TPR2：当FPR等于0.005时的TPR\n",
    "    # TPR3：当FPR等于0.01时的TPR\n",
    "    # 模型成绩 = 0.4 * TPR1 + 0.3 * TPR2 + 0.3 * TPR3\n",
    "    \n",
    "    probas = pipe.predict_proba(X_data)\n",
    "    fpr, tpr, thresholds = roc_curve(y_gt, probas[:, 1],pos_label=1)\n",
    "\n",
    "    fpr_dic = {'r1':0.001, 'r2':0.005, 'r3':0.01}\n",
    "    tpr_dic = {}\n",
    "    \n",
    "    for k, fpr_thres in fpr_dic.items():\n",
    "        index = np.argmin(abs(fpr - fpr_thres))\n",
    "        tpr_dic[k] = tpr[index]\n",
    "\n",
    "    final_score = 0.4*tpr_dic['r1'] + 0.3*tpr_dic['r2'] + 0.3*tpr_dic['r3']\n",
    "\n",
    "    print('tpr at each level: ')\n",
    "    print(tpr_dic)\n",
    "    print('final score \\t: {} '.format(final_score))\n",
    "    return tpr_dic, final_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Grid Search\n",
    "\n",
    "- 用10w训练数据进行Grid Search\n",
    "- Search参数：max_depth n_estimators"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import GridSearchCV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all = pd.read_csv('./df_1m.csv') # 训练全集\n",
    "df = df_all.sample(100000, random_state=0, replace=False) # 选取10w子集\n",
    "del df_all  # 释放内存"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 划分训练和测试集\n",
    "X_train, X_test, y_train, y_test = \\\n",
    "    train_test_split(df.iloc[:,3:], \n",
    "                     df.iloc[:,1], \n",
    "                     test_size=0.3, \n",
    "                     stratify=y, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# grid search 不使用pipeline，此处需要先计算X的标准化值\n",
    "sc = StandardScaler()\n",
    "X_train_std = sc.fit_transform(X_train)\n",
    "X_test_std = sc.fit_transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "param_grid = {'n_estimators':range(100,201,50), # 取值 100 150 200\n",
    "              'max_depth': range(3,7,1)} # 取值3 4 5 6，总计12种组合\n",
    "gs = GridSearchCV(\n",
    "  estimator=GradientBoostingClassifier(random_state=10), \n",
    "                  param_grid=param_grid, \n",
    "                  scoring='f1',   # 参考f1指标选优\n",
    "                  cv=3,           # 3折交叉验证\n",
    "                  n_jobs=-1)      # 设置多线程训练"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "gs = gs.fit(X_train_std, y_train)\n",
    "print(gs.best_score_)\n",
    "print(gs.best_params_)  # 输出最优参数"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 正式训练\n",
    "\n",
    "- 用更大的训练集训练模型\n",
    "- 输出模型的性能指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('./df_1m.csv') # 训练全集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 划分训练和测试集\n",
    "X_train, X_test, y_train, y_test = \\\n",
    "    train_test_split(df.iloc[:,3:], \n",
    "                     df.iloc[:,1], \n",
    "                     test_size=0.3, \n",
    "                     stratify=y, random_state=0)\n",
    "del df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 此处参数替换为搜索的最优参数\n",
    "pipe_gb = make_pipeline(StandardScaler(),\n",
    "                        GradientBoostingClassifier(max_depth=4,\n",
    "                                                   n_estimators=200， \n",
    "                                                   tol=0.0001,\n",
    "                                                   verbose=10, # 输出详细信息\n",
    "                                                   random_state=10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe_gb.fit(X_train, y_train)  # 模型训练"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 输出分类指标"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train_pred = pipe_gb.predict(X_train)  # 计算预测值\n",
    "show_metric(X_train, y_train, pipe_gb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test_pred = pipe_gb.predict(X_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 输出比赛score\n",
    "\n",
    "- 对比训练集和验证集的score可以判断过拟合的程度"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = get_tpr(X_train, y_train, pipe_gb)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "p = get_tpr(X_test, y_test, pipe_gb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Infer test dataset\n",
    "\n",
    "- 用GBDT模型对test数据预测\n",
    "- 输出可提交格式的csv文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test = pd.read_csv('./df_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def infer_testset_save(df):\n",
    "    # 读取测试集，用模型infer概率，输出提交结果\n",
    "    test_dim = df.shape[0]\n",
    "    X_test2 = df.iloc[:,2:]\n",
    "    test_score = pipe_gb.predict_proba(X_test2)\n",
    "    assert(test_score.shape[0] == test_dim)\n",
    "\n",
    "    df_submit = pd.DataFrame()\n",
    "    df_submit['id'] = df_test.loc[:,'id']\n",
    "    df_submit['score'] = test_score[:,1]\n",
    "\n",
    "    assert(df_submit.shape[0] == test_dim)\n",
    "    return df_submit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_submit = infer_testset_save(df_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_submit.to_csv('./submit_0212A.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
